/* panama_x.c */

/**************************************************************************+
*
*  PANAMA high-performance reference C-code, based on the description in
*  the paper 'Fast Hashing and Stream Encryption with PANAMA', presented
*  at the Fast Software Encryption Workshop, Paris, 1998, see "Fast
*  Software Encryption - 5th International Workshop, FSE'98", edited by
*  Serge Vaudenay, LNCS-1372, Springer-Verlag, 1998, pp 60-74, also
*  available on-line at http://standard.pictel.com/ftp/research/security
*
*  Algorithm design by Joan Daemen and Craig Clapp
*
*  panama_x.c  -  Core routines for the Panama stream/hash module, this
*                 exportable version excludes an encryption routine.
*
*
*  History:
*
*  29-Oct-98  Craig Clapp  Implementation for Dr. Dobbs, Dec. 1998 issue,
*                          based on earlier performance-benchmark code.
*
*
*  Notes:  This code is supplied for the purposes of evaluating the
*          performance of the Panama stream/hash module and as a
*          reference implementation for generating test vectors for
*          compatibility / interoperability verification.
*
*
+**************************************************************************/

#include <assert.h>
#include "panama.h"


/**************************************************************************+
*                         Panama internal routines                         *
+**************************************************************************/

/* tau, rotate word 'a' to the left by rol_bits bit positions */
/* thin alias for ROTL32, which is not defined in this file (presumably supplied by panama.h) */

#define tau(a, rol_bits)  ROTL32(a, rol_bits)

/**************************************************************************/

/* move state between memory and local registers */
/*                                                                          */
/* READ_STATE  copies state->word[0..16] into the locals state_0..state_16  */
/* WRITE_STATE copies the locals state_0..state_16 back to state->word[]    */
/*                                                                          */
/* Both macros expect a pointer named 'state' and the locals state_0 to     */
/* state_16 (see regs() below) to be visible where they are expanded.       */
/* Neither expansion ends in a semicolon, the call site supplies it.        */

#define READ_STATE_i(i)   state_##i = state->word[i]
#define WRITE_STATE_i(i)  state->word[i] = state_##i


#define READ_STATE    \
                      \
    READ_STATE_i(0);  \
    READ_STATE_i(1);  \
    READ_STATE_i(2);  \
    READ_STATE_i(3);  \
    READ_STATE_i(4);  \
    READ_STATE_i(5);  \
    READ_STATE_i(6);  \
    READ_STATE_i(7);  \
    READ_STATE_i(8);  \
    READ_STATE_i(9);  \
    READ_STATE_i(10); \
    READ_STATE_i(11); \
    READ_STATE_i(12); \
    READ_STATE_i(13); \
    READ_STATE_i(14); \
    READ_STATE_i(15); \
    READ_STATE_i(16)


#define WRITE_STATE    \
                       \
    WRITE_STATE_i(0);  \
    WRITE_STATE_i(1);  \
    WRITE_STATE_i(2);  \
    WRITE_STATE_i(3);  \
    WRITE_STATE_i(4);  \
    WRITE_STATE_i(5);  \
    WRITE_STATE_i(6);  \
    WRITE_STATE_i(7);  \
    WRITE_STATE_i(8);  \
    WRITE_STATE_i(9);  \
    WRITE_STATE_i(10); \
    WRITE_STATE_i(11); \
    WRITE_STATE_i(12); \
    WRITE_STATE_i(13); \
    WRITE_STATE_i(14); \
    WRITE_STATE_i(15); \
    WRITE_STATE_i(16)

/**************************************************************************/

/* gamma, shift-invariant transformation a[i] XOR (a[i+1] OR NOT a[i+2]) */
/*                                                                        */
/* Reads the locals state_0..state_16 and writes gamma_0..gamma_16.       */
/* The word indices wrap modulo 17, which is why the last two rows of the */
/* table refer back to words 0 and 1.                                     */

#define gamma_in_(i)   state_##i
#define gamma_out_(i)  gamma_##i

#define GAMMA_i(i, i_plus_1, i_plus_2)  \
                                        \
    gamma_out_(i) = gamma_in_(i) ^ (gamma_in_(i_plus_1) | ~gamma_in_(i_plus_2))


#define GAMMA            \
                         \
    GAMMA_i( 0,  1,  2); \
    GAMMA_i( 1,  2,  3); \
    GAMMA_i( 2,  3,  4); \
    GAMMA_i( 3,  4,  5); \
    GAMMA_i( 4,  5,  6); \
    GAMMA_i( 5,  6,  7); \
    GAMMA_i( 6,  7,  8); \
    GAMMA_i( 7,  8,  9); \
    GAMMA_i( 8,  9, 10); \
    GAMMA_i( 9, 10, 11); \
    GAMMA_i(10, 11, 12); \
    GAMMA_i(11, 12, 13); \
    GAMMA_i(12, 13, 14); \
    GAMMA_i(13, 14, 15); \
    GAMMA_i(14, 15, 16); \
    GAMMA_i(15, 16,  0); \
    GAMMA_i(16,  0,  1)

/**************************************************************************/

/* pi, permute and cyclically rotate the state words */
/*                                                                         */
/* PI_i(i, j, k) sets pi_i to gamma_j rotated left by k bits.              */
/* Word 0 is passed through unchanged. For the remaining rows of the table */
/* the source index is j = 7*i mod 17 and the rotation is                  */
/* k = i*(i+1)/2 mod 32.                                                   */

#define pi_in_(i)   gamma_##i
#define pi_out_(i)  pi_##i

#define PI_i(i, j, k)  pi_out_(i) = tau(pi_in_(j), k)


#define PI                  \
                            \
    pi_out_(0) = pi_in_(0); \
    PI_i( 1,  7,  1);       \
    PI_i( 2, 14,  3);       \
    PI_i( 3,  4,  6);       \
    PI_i( 4, 11, 10);       \
    PI_i( 5,  1, 15);       \
    PI_i( 6,  8, 21);       \
    PI_i( 7, 15, 28);       \
    PI_i( 8,  5,  4);       \
    PI_i( 9, 12, 13);       \
    PI_i(10,  2, 23);       \
    PI_i(11,  9,  2);       \
    PI_i(12, 16, 14);       \
    PI_i(13,  6, 27);       \
    PI_i(14, 13,  9);       \
    PI_i(15,  3, 24);       \
    PI_i(16, 10,  8)

/**************************************************************************/

/* theta, shift-invariant transformation a[i] XOR a[i+1] XOR a[i+4] */
/*                                                                   */
/* Reads the locals pi_0..pi_16 and writes theta_0..theta_16.        */
/* The word indices wrap modulo 17, which is why the last four rows  */
/* of the table refer back to words 0 through 3.                     */

#define theta_in_(i)   pi_##i
#define theta_out_(i)  theta_##i

#define THETA_i(i, i_plus_1, i_plus_4)  \
                                        \
    theta_out_(i) = theta_in_(i) ^ theta_in_(i_plus_1) ^ theta_in_(i_plus_4)


#define THETA            \
                         \
    THETA_i( 0,  1,  4); \
    THETA_i( 1,  2,  5); \
    THETA_i( 2,  3,  6); \
    THETA_i( 3,  4,  7); \
    THETA_i( 4,  5,  8); \
    THETA_i( 5,  6,  9); \
    THETA_i( 6,  7, 10); \
    THETA_i( 7,  8, 11); \
    THETA_i( 8,  9, 12); \
    THETA_i( 9, 10, 13); \
    THETA_i(10, 11, 14); \
    THETA_i(11, 12, 15); \
    THETA_i(12, 13, 16); \
    THETA_i(13, 14,  0); \
    THETA_i(14, 15,  1); \
    THETA_i(15, 16,  2); \
    THETA_i(16,  0,  3)

/**************************************************************************/

/* sigma, merge two buffer stages with current state */
/*                                                                        */
/* Reads theta_0..theta_16 and writes the result back into the locals     */
/* state_0..state_16, completing one round:                               */
/*                                                                        */
/*   state_0      = theta_0 XOR 1                                         */
/*   state_1..8   = theta_1..8  XOR L->word[0..7]                         */
/*   state_9..16  = theta_9..16 XOR b->word[0..7]                         */
/*                                                                        */
/* Pointers named 'L' and 'b' must be visible where SIGMA is expanded.    */
/* 'b' always addresses a buffer stage, what 'L' addresses is chosen by   */
/* the routine that expands the macro.                                    */

#define sigma_in_(i)   theta_##i
#define sigma_out_(i)  state_##i

#define SIGMA_L_i(i)  sigma_out_(i) = sigma_in_(i) ^ L->word[i-1]
#define SIGMA_B_i(i)  sigma_out_(i) = sigma_in_(i) ^ b->word[i-9]


#define SIGMA      \
                   \
    sigma_out_(0) = sigma_in_(0) ^ 0x00000001L; \
                   \
    SIGMA_L_i(1);  \
    SIGMA_L_i(2);  \
    SIGMA_L_i(3);  \
    SIGMA_L_i(4);  \
    SIGMA_L_i(5);  \
    SIGMA_L_i(6);  \
    SIGMA_L_i(7);  \
    SIGMA_L_i(8);  \
                   \
    SIGMA_B_i(9);  \
    SIGMA_B_i(10); \
    SIGMA_B_i(11); \
    SIGMA_B_i(12); \
    SIGMA_B_i(13); \
    SIGMA_B_i(14); \
    SIGMA_B_i(15); \
    SIGMA_B_i(16)

/**************************************************************************/

/* lambda, update the 256-bit wide by 32-stage LFSR buffer */
/*                                                                         */
/* Pointers named 'ptap_0' and 'ptap_25' must be visible where these       */
/* macros are expanded, both routines in this file set them after tap_0    */
/* has been stepped to its new position.                                   */
/*                                                                         */
/* LAMBDA_25_i(i)        XORs word (i+2) of the stage at ptap_0 into word  */
/*                       i of the stage at ptap_25. The index is wrapped   */
/*                       by masking with PAN_STAGE_SIZE-1, which assumes   */
/*                       PAN_STAGE_SIZE is a power of two.                 */
/* LAMBDA_0_i(i, source) XORs 'source' into word i of the stage at ptap_0. */
/*                                                                         */
/* LAMBDA_0_PULL feeds the stage at ptap_0 from state_1..state_8,          */
/* LAMBDA_0_PUSH feeds it from the eight words at 'L'.                     */

#define LAMBDA_25_i(i)  \
  ptap_25->word[i] = ptap_25->word[i] ^ ptap_0->word[(i+2) & (PAN_STAGE_SIZE-1)]

#define LAMBDA_0_i(i, source)  ptap_0->word[i] = source ^ ptap_0->word[i]


#define LAMBDA_25_UPDATE \
                         \
    LAMBDA_25_i(0);      \
    LAMBDA_25_i(1);      \
    LAMBDA_25_i(2);      \
    LAMBDA_25_i(3);      \
    LAMBDA_25_i(4);      \
    LAMBDA_25_i(5);      \
    LAMBDA_25_i(6);      \
    LAMBDA_25_i(7)

#define LAMBDA_0_PULL       \
                            \
    LAMBDA_0_i(0, state_1); \
    LAMBDA_0_i(1, state_2); \
    LAMBDA_0_i(2, state_3); \
    LAMBDA_0_i(3, state_4); \
    LAMBDA_0_i(4, state_5); \
    LAMBDA_0_i(5, state_6); \
    LAMBDA_0_i(6, state_7); \
    LAMBDA_0_i(7, state_8)

#define LAMBDA_0_PUSH          \
                               \
    LAMBDA_0_i(0, L->word[0]); \
    LAMBDA_0_i(1, L->word[1]); \
    LAMBDA_0_i(2, L->word[2]); \
    LAMBDA_0_i(3, L->word[3]); \
    LAMBDA_0_i(4, L->word[4]); \
    LAMBDA_0_i(5, L->word[5]); \
    LAMBDA_0_i(6, L->word[6]); \
    LAMBDA_0_i(7, L->word[7])

/* avoid temporary register for tap 31 by finishing updating tap 25 before updating tap 0 */
/* (the stage at ptap_0 is read by LAMBDA_25_UPDATE, so it must not be overwritten first) */
#define LAMBDA_PULL   \
    LAMBDA_25_UPDATE; \
    LAMBDA_0_PULL

#define LAMBDA_PUSH   \
    LAMBDA_25_UPDATE; \
    LAMBDA_0_PUSH

/**************************************************************************/

/* declarator list for the four per-word locals (state, gamma, pi, theta) used by the round macros; */
/* intended to follow a type name, e.g. UINT32 regs(0), regs(1); */
#define regs(i)  state_##i, gamma_##i, pi_##i, theta_##i

/**************************************************************************/




/**************************************************************************+
*                         Panama external routines                         *
+**************************************************************************/


/**************************************************************************+
*
*  pan_pull() - Performs multiple iterations of the Panama 'Pull' operation.
*               The input and output arrays are treated as integer multiples
*               of Panama's natural 256-bit block size.
*
*               Input and output arrays may be disjoint or coincident but
*               may not be overlapped if offset from one another.
*
*               Because coincident arrays (In == Out, in-place operation)
*               are permitted, 'In' and 'Out' are deliberately NOT
*               restrict-qualified: reading and writing the same object
*               through two restrict pointers would be undefined behaviour.
*               Top-level qualifiers on parameters are not part of the
*               function type, so a prototype that still spells them with
*               'restrict' remains compatible with this definition.
*
*               If 'In' is a NULL pointer then output is taken direct from
*               the state machine (used for hash output). If 'Out' is a NULL
*               pointer then a dummy 'Pull' is performed and 'In' is not
*               read at all. Otherwise 'In' is XOR combined with the state
*               machine to produce 'Out' (used for stream encryption /
*               decryption).
*
*               The words XORed onto the input are state words 9..16 as
*               they stand BEFORE each round is applied.
*
*               A pan_blocks value of zero or less performs no rounds; the
*               buffer and state are left unchanged.
*
+**************************************************************************/

void pan_pull(UINT32     *In,               /* input array                   */
              UINT32     *Out,              /* output array                  */
              long        pan_blocks,       /* number of blocks to be Pulled */
              PAN_BUFFER *restrict buffer,  /* LFSR buffer                   */
              PAN_STATE  *restrict state)   /* 17-word finite-state machine  */
{
    long i;

    UINT32 regs(0),  regs(1),  regs(2),  regs(3),  regs(4);
    UINT32 regs(5),  regs(6),  regs(7),  regs(8),  regs(9);
    UINT32 regs(10), regs(11), regs(12), regs(13), regs(14);
    UINT32 regs(15), regs(16);

    int tap_0;
    PAN_STAGE *restrict ptap_0, *restrict ptap_25;
    PAN_STAGE *restrict L, *restrict b;

    /* configure routine according to which PULL mode is intended */
    static UINT32 null_in[PAN_STAGE_SIZE] = {0,0,0,0,0,0,0,0};
    UINT32 dummy_out[PAN_STAGE_SIZE];
    int in_step, out_step;

    in_step = out_step = PAN_STAGE_SIZE;

    /* no input (or no output wanted): XOR against a fixed all-zero block, never advancing */
    if (In == NULL || Out == NULL)
    {
        In = null_in;
        in_step = 0;
    }

    /* dummy pull: every block overwrites the same local scratch array, which is then discarded */
    if (Out == NULL)
    {
        Out = dummy_out;
        out_step = 0;
    }

    /* copy buffer pointers and state to registers */
    tap_0 = buffer->tap_0;
    READ_STATE;

    /* rho, cascade of state update operations */

    for (i = 0; i < pan_blocks; i++)
    {
        /* apply state output to crypto buffer */
        Out[0] = In[0] ^ gamma_in_(9);
        Out[1] = In[1] ^ gamma_in_(10);
        Out[2] = In[2] ^ gamma_in_(11);
        Out[3] = In[3] ^ gamma_in_(12);
        Out[4] = In[4] ^ gamma_in_(13);
        Out[5] = In[5] ^ gamma_in_(14);
        Out[6] = In[6] ^ gamma_in_(15);
        Out[7] = In[7] ^ gamma_in_(16);

        Out += out_step;
        In  += in_step;

        GAMMA;  /* perform non-linearity stage */

        PI;     /* perform bit-dispersion stage */

        THETA;  /* perform diffusion stage */

        /* calculate pointers to taps 4 and 16 for sigma based on current position of tap 0 */
        L = &buffer->stage[(tap_0 +  4) & (PAN_STAGES - 1)];
        b = &buffer->stage[(tap_0 + 16) & (PAN_STAGES - 1)];

        /* move tap_0 left by one stage, equivalent to shifting LFSR one stage right */
        tap_0   = (tap_0 - 1) & (PAN_STAGES - 1);

        /* set tap pointers for use by lambda */
        ptap_0  = &buffer->stage[tap_0];
        ptap_25 = &buffer->stage[(tap_0 + 25) & (PAN_STAGES - 1)];

        LAMBDA_PULL;  /* update the LFSR buffer */

        /* postpone sigma until after lambda in order to avoid extra temporaries for feedback path */
        /* note that sigma gets to use the old positions of taps 4 and 16 */

        SIGMA;  /* perform buffer injection stage */
    }

    /* write buffer pointer and state back to memory */
    buffer->tap_0 = tap_0;
    WRITE_STATE;
}


/**************************************************************************+
*
*  pan_push() - Performs multiple iterations of the Panama 'Push' operation.
*               The input array is treated as an integer multiple of the
*               256-bit blocks which are Panama's natural input size.
*
*               Each round consumes one block of 'In': the block is XORed
*               into the LFSR stage at the new tap 0 and into state words
*               1..8. Nothing is written to 'In'.
*
*               A pan_blocks value of zero or less performs no rounds; the
*               buffer and state are left unchanged.
*
+**************************************************************************/

void pan_push(UINT32     *restrict In,      /* input array                   */
              long        pan_blocks,       /* number of blocks to be Pushed */
              PAN_BUFFER *restrict buffer,  /* LFSR buffer                   */
              PAN_STATE  *restrict state)   /* 17-word finite-state machine  */
{
    long i;

    UINT32 regs(0),  regs(1),  regs(2),  regs(3),  regs(4);
    UINT32 regs(5),  regs(6),  regs(7),  regs(8),  regs(9);
    UINT32 regs(10), regs(11), regs(12), regs(13), regs(14);
    UINT32 regs(15), regs(16);

    int tap_0;
    PAN_STAGE *restrict ptap_0, *restrict ptap_25;
    PAN_STAGE *restrict L, *restrict b;

    /* copy buffer pointers and state to registers */
    tap_0 = buffer->tap_0;
    READ_STATE;

    /* sanity check: converting the input pointer to PAN_STAGE* and back must yield the same pointer */
    assert((UINT32*)((PAN_STAGE*)In) == In);
    L = (PAN_STAGE*)In; /* we assume pointer to input buffer is compatible with pointer to PAN_STAGE */

    /* rho, cascade of state update operations */

    for (i = 0; i < pan_blocks; i++)
    {
        GAMMA;  /* perform non-linearity stage */

        PI;     /* perform bit-dispersion stage */

        THETA;  /* perform diffusion stage */


        /* calculate pointer to tap 16 for sigma based on current position of tap 0 */
        b = &buffer->stage[(tap_0 + 16) & (PAN_STAGES - 1)];

        /* move tap_0 left by one stage, equivalent to shifting LFSR one stage right */
        tap_0   = (tap_0 - 1) & (PAN_STAGES - 1);

        /* set tap pointers for use by lambda */
        ptap_0  = &buffer->stage[tap_0];
        ptap_25 = &buffer->stage[(tap_0 + 25) & (PAN_STAGES - 1)];

        LAMBDA_PUSH;  /* update the LFSR buffer */

        /* postpone sigma until after lambda in order to avoid extra temporaries for feedback path */
        /* note that sigma gets to use the old position of tap 16; in push mode its other */
        /* operand, L, is the current input block rather than a buffer tap */

        SIGMA;  /* perform buffer injection stage */

        L++; /* In += PAN_STAGE_SIZE; */
    }

    /* write buffer pointer and state back to memory */
    buffer->tap_0 = tap_0;
    WRITE_STATE;
}


/**************************************************************************+
*
*  pan_reset() - Returns a Panama instance to its starting condition:
*                every word of the state machine and of every LFSR stage
*                is cleared and tap 0 is placed at stage 0. Call this
*                before accumulating a new hash or before starting up or
*                re-synchronizing an encryption key-stream.
*
+**************************************************************************/

void pan_reset(PAN_BUFFER *buffer, PAN_STATE *state)
{
    int stage_no;
    int word_no;

    /* wipe the finite-state machine */
    word_no = 0;
    while (word_no < PAN_STATE_SIZE)
    {
        state->word[word_no] = 0;
        word_no++;
    }

    /* wipe the LFSR buffer, one stage at a time */
    stage_no = 0;
    while (stage_no < PAN_STAGES)
    {
        word_no = 0;
        while (word_no < PAN_STAGE_SIZE)
        {
            buffer->stage[stage_no].word[word_no] = 0;
            word_no++;
        }
        stage_no++;
    }

    /* tap 0 starts out at the first stage of the buffer */
    buffer->tap_0 = 0;
}


/**************************************************************************+
*
*  pan_hash() - Performs hashing over the input array, which is treated as
*               a little-endian bit-field of length specified by bitlength.
*
*               sourcetext  message words. Every word up to and including
*                           the one holding the last message bit is read;
*                           bits of that word beyond bitlength are ignored.
*                           Not dereferenced when bitlength is 0.
*               hashcode    receives the PAN_STAGE_SIZE-word hash result.
*               bitlength   message length in bits, must not be negative.
*                           A negative value trips the assert; in builds
*                           where asserts are disabled it is treated as 0
*                           rather than indexing outside the padding block.
*
*               The message is padded with a single '1' bit followed by
*               zeros up to the next block boundary, so a final padded
*               block is always pushed, even for an empty message or one
*               that is already an exact multiple of the block size.
*
+**************************************************************************/

void pan_hash(UINT32  *sourcetext,  /* input array                  */
              UINT32  *hashcode,    /* 256-bit hash result          */
              long     bitlength)   /* length to be hashed, in bits */
{
    long i;
    long pan_blocks, rembits, remwords;
    UINT32 pad_bit;
    UINT32 little_endian_remnant[PAN_STAGE_SIZE] = {0};

    PAN_BUFFER buffer;
    PAN_STATE state;

    /* a negative length would produce negative indices into the remnant block below */
    assert(bitlength >= 0);
    if (bitlength < 0)
    {
        bitlength = 0;
    }

    /* initialize the Panama state machine for a fresh hashing operation */
    pan_reset(&buffer, &state);

    /* divide the source array into full 256-bit blocks and a remnant */
    pan_blocks = bitlength / (PAN_STAGE_SIZE * WORDLENGTH);
    rembits = bitlength - pan_blocks * (PAN_STAGE_SIZE * WORDLENGTH);

    /* perform hashing operation on sourcetext array except for remnant */
    if (pan_blocks > 0)
    {
        pan_push(sourcetext, pan_blocks, &buffer, &state);
    }

    /* copy remnant bits to temporary array and append '1' */
    remwords = (rembits + WORDLENGTH - 1) / WORDLENGTH;
    for (i = 0; i < remwords; i++)
    {
        little_endian_remnant[i] = sourcetext[pan_blocks * PAN_STAGE_SIZE + i];
    }

    /* build the padding bit in unsigned arithmetic: shifting a negative value (~0L) or   */
    /* shifting a 1 into the sign bit of a 32-bit long (1L << 31) is undefined behaviour  */
    pad_bit = (UINT32)1 << (rembits % WORDLENGTH);

    /* keep only the message bits below the padding position, then set the padding bit */
    little_endian_remnant[rembits / WORDLENGTH] &= pad_bit - 1;
    little_endian_remnant[rembits / WORDLENGTH] |= pad_bit;

    /* operate on final remnant, padded out to 256-bit block */
    pan_push(little_endian_remnant, 1, &buffer, &state);

    /* perform 32 dummy PULL operations */
    pan_pull(NULL, NULL, 32, &buffer, &state);

    /* perform final PULL operation */
    pan_pull(NULL, hashcode, 1, &buffer, &state);

    /* 256-bit result is returned through *hashcode */
}


/**************************************************************************+
*
*  pan_crypt() - Placeholder for stream encryption or decryption.
*
*                In this exportable version the routine is an intentional
*                no-op: none of the arguments are read and nothing is
*                written to dest_buf. Callers must not expect any output.
*
+**************************************************************************/

void pan_crypt(UINT32    *source_buf,  /* input array                         */
               UINT32    *dest_buf,    /* output array                        */
               PAN_STAGE *key,         /* 256-bit (max) key                   */
               PAN_STAGE *init_vec,    /* 256-bit (max) initialization vector */
               long       bitlength)   /* length to be encrypted, in bits     */
{
  /*
    Due to U.S. government export restrictions on machine-readable
    computer code for performing encryption, no implementation of
    the Panama encryption function is included in this distribution.
  */

    /* explicitly discard the arguments so the stub compiles without unused-parameter warnings */
    (void)source_buf;
    (void)dest_buf;
    (void)key;
    (void)init_vec;
    (void)bitlength;
}

/**************************************************************************/
